##Loading Libraries
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ stringr 1.4.0
## ✓ tidyr 1.0.2 ✓ forcats 0.4.0
## ✓ readr 1.3.1
## ── Conflicts ────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(edgeR)
## Warning: package 'edgeR' was built under R version 3.6.1
## Loading required package: limma
library(pheatmap)
library(ggplot2)
library(ggbiplot)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
#Read in Data and Prepare Data Frames
# Both raw inputs live in the same directory; keep that path in one place.
data_dir <- "/Users/simransamra/R/git_temp/Repo_team_Genome-Surfers_W2020/data/raw_data"
demo <- read.csv(file.path(data_dir, "tcga_paad_clinical.csv"), header = TRUE)
# Presumably provides the expression table `tcga` (genes in rows, samples in
# columns), which is what the code below uses.
load(file.path(data_dir, "tcga_paad.RData"))

#Melt data
# Add the gene identifier on a copy so `tcga` itself stays purely numeric.
# Adding it to `tcga` directly would drag a character column through the
# transpose below, turning every expression value into text and creating a
# bogus "gene" sample row.
tcga_with_gene <- tcga
tcga_with_gene$gene <- rownames(tcga)
dataMelt <- melt(tcga_with_gene, id.vars = "gene", variable.name = "Samples")

#Set up data
# Keep only the clinical columns used by the downstream plots.
demo <- demo %>%
  dplyr::select(
    submitter_id, age_at_index, year_of_birth, year_of_death, vital_status,
    race, gender, ajcc_pathologic_m, ajcc_pathologic_n, ajcc_pathologic_t,
    ajcc_pathologic_stage
  )

# Samples become rows so they can be joined to the clinical table by id.
tcgaT <- as.data.frame(t(as.matrix(tcga)))
tcgaN <- tibble::rownames_to_column(tcgaT, "submitter_id")

# Join on a character key on both sides; a factor key here makes dplyr warn
# about coercing factor and character to a common type.
demo$submitter_id <- as.character(demo$submitter_id)
data <- right_join(x = tcgaN, y = demo, by = "submitter_id")
#Density Plot - Age at Index vs Vital Status
# The curves are mapped to `colour`, so the manual palette must be a colour
# scale; a fill scale is silently ignored because nothing is mapped to fill.
# The palette runs from dark blue to dark red: with two status levels this is
# exactly c("darkblue", "darkred"); extra levels get interpolated colours
# instead of triggering an "insufficient values" error.
n_vital_levels <- nlevels(factor(data$vital_status))
vital_colours <- grDevices::colorRampPalette(c("darkblue", "darkred"))(
  max(n_vital_levels, 2L)
)
ggplot(data, aes(x = age_at_index, colour = vital_status)) +
  geom_density() +
  labs(
    title = "Age at Index and Vital Status",
    color = "Vital Status",
    x = "Age at Index",
    y = "Density"
  ) +
  theme(
    plot.title = element_text(color = "blue", size = 12, face = "bold")
  ) +
  scale_colour_manual(values = vital_colours)
#Density Plot - Age at Index vs Pathologic Stage M
# Age-at-index density curves, one per AJCC pathologic M category.
m_plot_labels <- labs(
  title = "AJCC Pathologic M",
  subtitle = "AJCC TNM system: (M) Classifies cancers by the presence or absence of distant metastases",
  caption = "M0:No evidence of distant metastasis; M1: Distant metastasis; MX: Unknown distant metastasis status",
  color = "AJCC Pathologic M",
  x = "Age at Index",
  y = "Density"
)
# Bold blue title; small subtitle; small, left-aligned caption.
m_plot_theme <- theme(
  plot.title = element_text(color = "blue", size = 12, face = "bold"),
  plot.subtitle = element_text(color = "black", size = 7),
  plot.caption = element_text(color = "black", size = 6, hjust = 0)
)
ggplot(data, aes(x = age_at_index, colour = ajcc_pathologic_m)) +
  geom_density() +
  m_plot_labels +
  m_plot_theme
#Density Plot - Age at Index vs Pathologic Stage N
# Age-at-index density curves, one per AJCC pathologic N category.
data %>%
  ggplot(aes(x = age_at_index, colour = ajcc_pathologic_n)) +
  geom_density() +
  theme(
    plot.caption = element_text(color = "black", size = 6, hjust = 0),
    plot.subtitle = element_text(color = "black", size = 7),
    plot.title = element_text(color = "blue", size = 12, face = "bold")
  ) +
  labs(
    x = "Age at Index",
    y = "Density",
    color = "AJCC Pathologic N",
    title = "AJCC Pathologic N",
    subtitle = "AJCC TNM system: (N) Describes involvement of regional lymph nodes",
    caption = "--: Not reported; N0: No regional lymph node metastasis; N1: Regional lymph node metastasis; \n N1b: Metastasis in multiple regional lymph nodes; NX: Metastasis cannot be assessed"
  )
#Density Plot - Age at Index vs Pathologic Stage T
# Age-at-index density curves, one per AJCC pathologic T category.
# The subtitle is completed to parallel the M and N plots. The caption no
# longer ends in a stray " )" and line break, and "pancrease" is corrected.
ggplot(data, aes(x = age_at_index, colour = ajcc_pathologic_t)) +
  geom_density() +
  labs(
    title = "AJCC Pathologic T",
    subtitle = "AJCC TNM system: (T) Describes the size and extent of the primary tumor",
    x = "Age at Index",
    y = "Density",
    color = "AJCC Pathologic T",
    caption = "--: Not reported; T1: Tumor limited to the pancreas (2 cm or less in greatest dimension); T2: Tumor limited to \n the pancreas (greater than 2 cm in greatest dimension); T3: Tumor extends beyond pancreas, but without \n the involvement of coeliac axis or superior mesenteric artery; T4: Tumor involves coeliac axis or superior \n mesenteric artery; TX: Tumor cannot be assessed"
  ) +
  theme(
    plot.title = element_text(color = "blue", size = 12, face = "bold"),
    plot.subtitle = element_text(color = "black", size = 7),
    plot.caption = element_text(color = "black", size = 6, hjust = 0)
  )
# Definition of the different pathologic stages:
#Density Plot - Age at Index vs Pathologic Stage
# Age-at-index density curves, one per overall AJCC pathologic stage.
stage_density <- ggplot(
  data,
  aes(x = age_at_index, colour = ajcc_pathologic_stage)
)
stage_density +
  geom_density() +
  theme(
    plot.title = element_text(color = "blue", size = 12, face = "bold"),
    plot.subtitle = element_text(color = "black", size = 7),
    plot.caption = element_text(color = "black", size = 6, hjust = 0)
  ) +
  labs(
    title = "AJCC Pathologic Stage",
    color = "AJCC Pathologic Stage",
    x = "Age at Index",
    y = "Density"
  )
#Boxplot - Distribution of Gene Expression
# One box per sample over all genes in the melted expression table.
# The sample count in the title is computed from the data rather than typed
# in, so it cannot drift if the input changes.
n_samples <- length(unique(dataMelt$Samples))
ggplot(dataMelt, aes(x = Samples, y = value)) +
  geom_boxplot() +
  labs(
    x = "Samples",
    # Plain "Log2": markdown-style "_2_" is not rendered by ggplot2 and would
    # be printed literally on the axis.
    y = "Expression (Log2 Transformed)",
    title = paste0(
      "Distribution of Gene Expression Across All ", n_samples, " Samples"
    )
  ) +
  theme(
    # Sample ids are long; rotate them so they do not overlap.
    axis.text.x = element_text(angle = 90, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )
#Density Plot
# Overlaid expression density curves, one per sample; the legend is hidden
# because there is one entry per sample.
expression_density <- ggplot(dataMelt, aes(value, color = Samples)) +
  geom_density()
expression_density +
  labs(
    x = "Expression (Log2 Transformed)",
    y = "Density",
    title = "Distribution of Gene Expression Across All 177 Samples"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )
#Heatmaps: Show Correlation
#Prepare Data
# Reload the expression table so `tcga` holds only the sample columns
# (no helper columns added earlier in the script).
load("/Users/simransamra/R/git_temp/Repo_team_Genome-Surfers_W2020/data/raw_data/tcga_paad.RData")

# Keep every other clinical row (rows 1, 3, 5, ...).
# NOTE(review): this presumably drops a duplicated record per patient - confirm.
keep_rows <- seq(1, nrow(demo), by = 2)
demoHeat <- demo[keep_rows, ]

# The annotation rows are matched to the expression columns purely by
# position, so flag any case where the ids do not line up instead of silently
# annotating samples with another patient's clinical data.
if (!identical(as.character(demoHeat$submitter_id), colnames(tcga))) {
  warning(
    "Clinical rows in 'demoHeat' do not match the sample order of 'tcga'; ",
    "heatmap annotations may be attached to the wrong samples.",
    call. = FALSE
  )
}

designFactors <- as.data.frame(demoHeat[, c("ajcc_pathologic_stage", "vital_status")])
rownames(designFactors) <- colnames(tcga)

# Sample-by-sample correlation matrix of expression profiles.
data.matrix <- cor(tcga)
pheatmap(
  data.matrix,
  cluster_rows = TRUE,
  scale = "none",
  clustering_method = "average",
  clustering_distance_cols = "correlation",
  show_colnames = TRUE,
  show_rownames = TRUE,
  main = "Clustering Heatmap: Pathologic Stage and Vital Status ",
  # `annotation` is deprecated in pheatmap; it only forwards to annotation_col.
  annotation_col = designFactors,
  treeheight_col = 35,
  treeheight_row = 35,
  fontsize = 3
)
#Prepare Data
# `tcga`, `demoHeat` and `data.matrix` are still exactly as built for the
# first heatmap (nothing modifies them in between), so they are reused rather
# than reloaded and recomputed; only the annotation table changes here.
designFactors <- as.data.frame(demoHeat[, c("race", "gender")])
rownames(designFactors) <- colnames(tcga)
pheatmap(
  data.matrix,
  cluster_rows = TRUE,
  scale = "none",
  clustering_method = "average",
  clustering_distance_cols = "correlation",
  show_colnames = TRUE,
  show_rownames = TRUE,
  main = "Clustering Heatmap: Gender and Race",
  # `annotation` is deprecated in pheatmap; it only forwards to annotation_col.
  annotation_col = designFactors,
  treeheight_col = 35,
  treeheight_row = 35,
  fontsize = 3
)
#Prepare Data
# `tcga`, `demoHeat` and `data.matrix` are unchanged since the first heatmap,
# so they are reused rather than reloaded and recomputed; only the annotation
# table (the three AJCC TNM columns) changes here.
designFactors <- as.data.frame(
  demoHeat[, c("ajcc_pathologic_n", "ajcc_pathologic_t", "ajcc_pathologic_m")]
)
rownames(designFactors) <- colnames(tcga)
pheatmap(
  data.matrix,
  cluster_rows = TRUE,
  scale = "none",
  clustering_method = "average",
  clustering_distance_cols = "correlation",
  show_colnames = TRUE,
  show_rownames = TRUE,
  main = "Clustering Heatmap: AJCC TNM System",
  # `annotation` is deprecated in pheatmap; it only forwards to annotation_col.
  annotation_col = designFactors,
  treeheight_col = 35,
  treeheight_row = 35,
  fontsize = 3
)
# PCA on the samples: transpose so each row is a sample and each column a
# gene, centre the columns (prcomp's default of no scaling is kept), then
# print the variance explained by each component.
tcgaT <- tcga %>% t()
tcga.pca <- tcgaT %>% prcomp(center = TRUE)
tcga.pca %>% summary()
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 7.5906 7.4610 6.2529 4.86404 4.46313 4.11191 3.2606
## Proportion of Variance 0.1376 0.1330 0.0934 0.05652 0.04758 0.04039 0.0254
## Cumulative Proportion 0.1376 0.2706 0.3640 0.42053 0.46811 0.50850 0.5339
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 3.23307 2.66589 2.65305 2.40256 2.37936 2.27504 2.13855
## Proportion of Variance 0.02497 0.01698 0.01681 0.01379 0.01352 0.01236 0.01092
## Cumulative Proportion 0.55887 0.57584 0.59266 0.60645 0.61997 0.63233 0.64326
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 2.03944 1.99613 1.94578 1.88896 1.84567 1.75546 1.74472
## Proportion of Variance 0.00994 0.00952 0.00904 0.00852 0.00814 0.00736 0.00727
## Cumulative Proportion 0.65319 0.66271 0.67176 0.68028 0.68842 0.69578 0.70305
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 1.70065 1.61549 1.59193 1.57297 1.5446 1.48388 1.4753
## Proportion of Variance 0.00691 0.00623 0.00605 0.00591 0.0057 0.00526 0.0052
## Cumulative Proportion 0.70996 0.71619 0.72225 0.72816 0.7339 0.73912 0.7443
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 1.45221 1.43746 1.40615 1.38932 1.36818 1.34369 1.32985
## Proportion of Variance 0.00504 0.00494 0.00472 0.00461 0.00447 0.00431 0.00422
## Cumulative Proportion 0.74935 0.75429 0.75901 0.76362 0.76810 0.77241 0.77663
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 1.31221 1.2937 1.27397 1.25732 1.25202 1.23219 1.22025
## Proportion of Variance 0.00411 0.0040 0.00388 0.00378 0.00374 0.00363 0.00356
## Cumulative Proportion 0.78075 0.7847 0.78862 0.79240 0.79614 0.79977 0.80333
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 1.20476 1.19661 1.1929 1.17625 1.16008 1.14263 1.14034
## Proportion of Variance 0.00347 0.00342 0.0034 0.00331 0.00321 0.00312 0.00311
## Cumulative Proportion 0.80679 0.81021 0.8136 0.81692 0.82013 0.82325 0.82636
## PC50 PC51 PC52 PC53 PC54 PC55 PC56
## Standard deviation 1.12769 1.12215 1.10669 1.09724 1.0834 1.07069 1.06702
## Proportion of Variance 0.00304 0.00301 0.00293 0.00288 0.0028 0.00274 0.00272
## Cumulative Proportion 0.82940 0.83240 0.83533 0.83821 0.8410 0.84375 0.84647
## PC57 PC58 PC59 PC60 PC61 PC62 PC63
## Standard deviation 1.05512 1.04918 1.03340 1.01957 1.00841 0.99769 0.99255
## Proportion of Variance 0.00266 0.00263 0.00255 0.00248 0.00243 0.00238 0.00235
## Cumulative Proportion 0.84913 0.85176 0.85431 0.85679 0.85922 0.86160 0.86395
## PC64 PC65 PC66 PC67 PC68 PC69 PC70
## Standard deviation 0.97970 0.97378 0.96766 0.95341 0.95272 0.94468 0.93960
## Proportion of Variance 0.00229 0.00227 0.00224 0.00217 0.00217 0.00213 0.00211
## Cumulative Proportion 0.86624 0.86851 0.87075 0.87292 0.87509 0.87722 0.87933
## PC71 PC72 PC73 PC74 PC75 PC76 PC77
## Standard deviation 0.93296 0.92647 0.92585 0.9160 0.90319 0.89812 0.88882
## Proportion of Variance 0.00208 0.00205 0.00205 0.0020 0.00195 0.00193 0.00189
## Cumulative Proportion 0.88141 0.88346 0.88550 0.8875 0.88946 0.89138 0.89327
## PC78 PC79 PC80 PC81 PC82 PC83 PC84
## Standard deviation 0.88479 0.87601 0.87163 0.86513 0.85762 0.85084 0.84765
## Proportion of Variance 0.00187 0.00183 0.00181 0.00179 0.00176 0.00173 0.00172
## Cumulative Proportion 0.89514 0.89697 0.89879 0.90058 0.90233 0.90406 0.90578
## PC85 PC86 PC87 PC88 PC89 PC90 PC91
## Standard deviation 0.84132 0.83277 0.82569 0.82210 0.82080 0.81245 0.80549
## Proportion of Variance 0.00169 0.00166 0.00163 0.00161 0.00161 0.00158 0.00155
## Cumulative Proportion 0.90747 0.90913 0.91076 0.91237 0.91398 0.91556 0.91711
## PC92 PC93 PC94 PC95 PC96 PC97 PC98
## Standard deviation 0.80186 0.79948 0.79645 0.7915 0.78182 0.77279 0.76916
## Proportion of Variance 0.00154 0.00153 0.00152 0.0015 0.00146 0.00143 0.00141
## Cumulative Proportion 0.91864 0.92017 0.92169 0.9232 0.92464 0.92607 0.92748
## PC99 PC100 PC101 PC102 PC103 PC104 PC105
## Standard deviation 0.76758 0.76277 0.75910 0.75578 0.75163 0.74820 0.74532
## Proportion of Variance 0.00141 0.00139 0.00138 0.00136 0.00135 0.00134 0.00133
## Cumulative Proportion 0.92889 0.93028 0.93166 0.93302 0.93437 0.93571 0.93703
## PC106 PC107 PC108 PC109 PC110 PC111 PC112
## Standard deviation 0.74211 0.73388 0.73367 0.72617 0.71914 0.71562 0.71149
## Proportion of Variance 0.00132 0.00129 0.00129 0.00126 0.00124 0.00122 0.00121
## Cumulative Proportion 0.93835 0.93964 0.94092 0.94218 0.94342 0.94464 0.94585
## PC113 PC114 PC115 PC116 PC117 PC118 PC119
## Standard deviation 0.7093 0.70143 0.69659 0.69572 0.69245 0.69013 0.68343
## Proportion of Variance 0.0012 0.00118 0.00116 0.00116 0.00115 0.00114 0.00112
## Cumulative Proportion 0.9470 0.94823 0.94939 0.95054 0.95169 0.95283 0.95394
## PC120 PC121 PC122 PC123 PC124 PC125 PC126
## Standard deviation 0.6787 0.67516 0.67369 0.67091 0.66749 0.66510 0.66058
## Proportion of Variance 0.0011 0.00109 0.00108 0.00108 0.00106 0.00106 0.00104
## Cumulative Proportion 0.9550 0.95613 0.95721 0.95829 0.95935 0.96041 0.96145
## PC127 PC128 PC129 PC130 PC131 PC132 PC133
## Standard deviation 0.65136 0.64932 0.6462 0.64365 0.63658 0.63520 0.63047
## Proportion of Variance 0.00101 0.00101 0.0010 0.00099 0.00097 0.00096 0.00095
## Cumulative Proportion 0.96247 0.96347 0.9645 0.96546 0.96643 0.96739 0.96834
## PC134 PC135 PC136 PC137 PC138 PC139 PC140
## Standard deviation 0.62624 0.62368 0.62138 0.61827 0.61653 0.6152 0.60949
## Proportion of Variance 0.00094 0.00093 0.00092 0.00091 0.00091 0.0009 0.00089
## Cumulative Proportion 0.96928 0.97021 0.97113 0.97204 0.97295 0.9739 0.97474
## PC141 PC142 PC143 PC144 PC145 PC146 PC147
## Standard deviation 0.60762 0.60520 0.59790 0.59652 0.58800 0.58758 0.58335
## Proportion of Variance 0.00088 0.00087 0.00085 0.00085 0.00083 0.00082 0.00081
## Cumulative Proportion 0.97562 0.97650 0.97735 0.97820 0.97903 0.97985 0.98067
## PC148 PC149 PC150 PC151 PC152 PC153 PC154
## Standard deviation 0.58192 0.57633 0.57534 0.57039 0.56453 0.55945 0.55876
## Proportion of Variance 0.00081 0.00079 0.00079 0.00078 0.00076 0.00075 0.00075
## Cumulative Proportion 0.98148 0.98227 0.98306 0.98384 0.98460 0.98535 0.98609
## PC155 PC156 PC157 PC158 PC159 PC160 PC161
## Standard deviation 0.55634 0.55003 0.54940 0.54532 0.5422 0.53841 0.53744
## Proportion of Variance 0.00074 0.00072 0.00072 0.00071 0.0007 0.00069 0.00069
## Cumulative Proportion 0.98683 0.98755 0.98828 0.98899 0.9897 0.99038 0.99107
## PC162 PC163 PC164 PC165 PC166 PC167 PC168
## Standard deviation 0.53566 0.52821 0.52474 0.52011 0.51474 0.51070 0.50689
## Proportion of Variance 0.00069 0.00067 0.00066 0.00065 0.00063 0.00062 0.00061
## Cumulative Proportion 0.99176 0.99242 0.99308 0.99373 0.99436 0.99498 0.99560
## PC169 PC170 PC171 PC172 PC173 PC174 PC175
## Standard deviation 0.5007 0.49704 0.49319 0.48546 0.47300 0.47136 0.46727
## Proportion of Variance 0.0006 0.00059 0.00058 0.00056 0.00053 0.00053 0.00052
## Cumulative Proportion 0.9962 0.99679 0.99737 0.99793 0.99846 0.99899 0.99952
## PC176 PC177
## Standard deviation 0.45009 1.669e-14
## Proportion of Variance 0.00048 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
# Build one PCA score plot (variable arrows hidden) coloured by a clinical
# factor, with the title stored in the plot object itself so it is also shown
# when the plot is placed in a multi-panel figure.
#   pca    - the prcomp fit to draw
#   groups - grouping vector passed to ggbiplot (one entry per sample;
#            assumes demoHeat rows follow the sample order of the PCA input)
#   title  - plot title
plot_pca_by <- function(pca, groups, title) {
  ggbiplot(pca, var.axes = FALSE, groups = groups) +
    labs(title = title)
}

#PCA - Gender
pca_gender <- plot_pca_by(tcga.pca, demoHeat$gender, "Gender")
pca_gender
#PCA - Race
pca_race <- plot_pca_by(tcga.pca, demoHeat$race, "Race")
pca_race
#PCA - Vital Status
# Named pca_vital rather than `c`, which would shadow base::c.
pca_vital <- plot_pca_by(tcga.pca, demoHeat$vital_status, "Vital Status")
pca_vital
#PCA - AJCC Pathologic M
pca_m <- plot_pca_by(tcga.pca, demoHeat$ajcc_pathologic_m, "AJCC Pathologic M")
pca_m
#PCA - AJCC Pathologic N
pca_n <- plot_pca_by(tcga.pca, demoHeat$ajcc_pathologic_n, "AJCC Pathologic N")
pca_n
#PCA - AJCC Pathologic T
pca_t <- plot_pca_by(tcga.pca, demoHeat$ajcc_pathologic_t, "AJCC Pathologic T")
pca_t
#PCA - AJCC Pathologic Stage
pca_stage <- plot_pca_by(
  tcga.pca, demoHeat$ajcc_pathologic_stage, "AJCC Pathologic Stage"
)
pca_stage

#Multiple PCA plots
# Panels carry their own titles, so each one is identifiable in the grid.
ggarrange(pca_gender, pca_race, pca_vital, ncol = 2, nrow = 2)
ggarrange(pca_m, pca_n, pca_t, pca_stage, ncol = 2, nrow = 2)